iT邦幫忙

2025 iThome 鐵人賽

DAY 10
0
生成式 AI

練習AI系列 第 11

多模態應用整合!

  • 分享至 

  • xImage
  •  

🆕 新增/修改的程式碼

  1. src/utils/summarize.js(新增)

通用摘要器:可直接餵 Day 8 逐字稿或任何長文,會輸出精煉摘要+章節重點。

// src/utils/summarize.js
import { openai } from "../aiClient.js";
import { PromptBuilder } from "../promptBuilder.js";

/**
 * Summarize long text into a script that is easy to read aloud (TTS friendly).
 * @param {string} text - source text (may be very long)
 * @param {object} [opts]
 * @param {string} [opts.tone="friendly"] - tone: friendly | professional
 * @param {("short"|"medium"|"long")} [opts.length="medium"] - target length
 * @returns {Promise<{summary:string, outline:string[]}>} spoken-style summary plus outline bullets
 * @throws {Error} when the model reply cannot be parsed into the expected JSON shape
 */
export async function summarizeForAudio(text, opts = {}) {
  const { tone = "friendly", length = "medium" } = opts;

  // Map requested length to a character-count hint for the prompt.
  const lengthHint =
    length === "short" ? "約 120~180 字" : length === "long" ? "約 600~900 字" : "約 250~400 字";

  const pb = new PromptBuilder()
    .setRole("你是專業的中文內容編輯與配音腳本撰寫者")
    .setGoal("將長文整理成適合口語朗讀的稿件,並提供 4~8 條節點大綱")
    .addConstraint("語句自然,避免過長句;易於 TTS 朗讀")
    .addConstraint("避免虛構資訊;保留關鍵數據與結論")
    .addConstraint("輸出包含:『摘要稿』與『大綱列表』兩部分,以 JSON 格式")
    // FIX: this was a bare expression — it must be a template literal.
    .setFormatHint(`長度提示:${lengthHint},語氣:${tone}`)
    .setUserInput(text);

  const res = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    temperature: 0.4,
    messages: [
      { role: "system", content: pb.buildSystemPrompt() },
      // FIX: inner double quotes were unescaped, breaking the string literal.
      { role: "user", content: '請以純 JSON 回覆:{"summary":"...","outline":["...", ...]}' }
    ]
  });

  const raw = res.choices?.[0]?.message?.content?.trim() || "{}";
  // Tolerant parsing: if the model wrapped its reply in a ```json ... ``` code
  // fence, extract the fenced body; otherwise parse the raw reply as-is.
  // FIX: the fence delimiters were missing from the regex, so it matched an
  // empty string and the real payload was discarded.
  const fenced = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1];
  const json = fenced ?? raw;
  let obj;
  try {
    obj = JSON.parse(json);
  } catch (err) {
    throw new Error("摘要結果無法解析為 JSON", { cause: err });
  }
  if (!obj.summary || !Array.isArray(obj.outline)) {
    throw new Error("摘要結果格式不正確");
  }
  return obj;
}

  1. src/day10_multimodal.js(新增)

兩條管線皆會把中繼內容與最終音檔落在 outputs/mm/<管線名稱>/... 之下。

// src/day10_multimodal.js
import fs from "fs";
import path from "path";
import { imageToJson } from "./day7_image_to_text.js";
import { speak } from "./day9_text_to_speech.js";
import { transcribe } from "./day8_speech_to_text.js";
import { summarizeForAudio } from "./utils/summarize.js";

/** Create directory `dir` (including parents) unless it already exists. */
function ensureDir(dir) {
  if (fs.existsSync(dir)) return;
  fs.mkdirSync(dir, { recursive: true });
}

/**
 * Pipeline 1: Photo Narration — image → description (JSON) → audio file.
 * @param {object} [opts]
 * @param {string} [opts.imagePath] - local image file
 * @param {string} [opts.imageUrl] - remote image URL
 * @param {boolean} [opts.wantOCR=false] - also extract text from the image
 * @param {("short"|"medium"|"long")} [opts.length="medium"]
 * @param {string} [opts.voice="alloy"] - TTS voice
 * @param {string} [opts.format="mp3"] - audio output format
 * @returns {Promise<{jsonPath:string, audioPath:string}>} paths of the saved
 *   description JSON and the generated narration audio
 */
export async function photoNarration(opts = {}) {
  const { imagePath, imageUrl, wantOCR = false, length = "medium", voice = "alloy", format = "mp3" } = opts;
  const outDir = path.join("outputs", "mm", "photo_narration");
  ensureDir(outDir);

  // 1) image -> JSON description
  const desc = await imageToJson({ imagePath, imageUrl, wantOCR, length });
  // FIX: filename expressions below were missing template-literal backticks.
  const jsonPath = path.join(outDir, `desc_${Date.now()}.json`);
  fs.writeFileSync(jsonPath, JSON.stringify(desc, null, 2), "utf-8");

  // 2) JSON -> narration script (title + description joined into readable text)
  const script = `${desc.title}。${desc.description}`;

  // 3) script -> TTS
  const { filepath } = await speak({
    text: script,
    voice,
    format,
    outputDir: outDir,
    filename: `narration_${Date.now()}`
  });

  return { jsonPath, audioPath: filepath };
}

/**
 * Pipeline 2: Meeting-to-Podcast — audio → transcript → (spoken-style
 * summary) → narrated audio file.
 * @param {object} [opts]
 * @param {string} [opts.filePath] - local audio file
 * @param {string} [opts.url] - remote audio URL
 * @param {string} [opts.lang="zh"] - transcription language
 * @param {string} [opts.prompt] - glossary / proper-noun hints for STT
 * @param {("short"|"medium"|"long")} [opts.length="medium"]
 * @param {string} [opts.voice="aria"] - TTS voice
 * @param {string} [opts.format="mp3"] - audio output format
 * @returns {Promise<{transcriptPath:string, summaryPath:string, audioPath:string}>}
 */
export async function meetingToPodcast(opts = {}) {
  const { filePath, url, lang = "zh", prompt = "", length = "medium", voice = "aria", format = "mp3" } = opts;
  const outDir = path.join("outputs", "mm", "meeting_podcast");
  ensureDir(outDir);

  // 1) STT: audio -> transcript
  const { text, saved } = await transcribe({ filePath, url, language: lang, prompt, detailed: false });

  // 2) long transcript -> spoken-style summary (TTS friendly)
  const { summary, outline } = await summarizeForAudio(text, { tone: "friendly", length });
  const summaryObj = { outline, summary };
  // FIX: escape the dot so only a real extension is stripped —
  // /.[^.]+$/ would wipe an extension-less basename entirely.
  const baseName = path.basename(saved.txt).replace(/\.[^.]+$/, "");
  // FIX: path/filename expressions below were missing template-literal backticks.
  const summaryPath = path.join(outDir, `${baseName}_summary.json`);
  fs.writeFileSync(summaryPath, JSON.stringify(summaryObj, null, 2), "utf-8");

  // 3) summary -> TTS (podcast tone)
  const script = `這是今天會議的口語化摘要。${summary}`;
  const { filepath } = await speak({
    text: script,
    voice,
    format,
    outputDir: outDir,
    filename: `${baseName}_podcast`
  });

  return { transcriptPath: saved.txt, summaryPath, audioPath: filepath };
}

  1. index.js(修改:加入多模態入口)
    // index.js
    import { englishTeacher, codeReview, sentimentClassify } from "./src/day3_prompt_engineering.js";
    import { newsToJson } from "./src/day4_text_to_json.js";
    import { chatOnce, resetSession } from "./src/day5_chat_history.js";
    import { textToImage } from "./src/day6_text_to_image.js";
    import { imageToJson } from "./src/day7_image_to_text.js";
    import { transcribe } from "./src/day8_speech_to_text.js";
    import { speak, speakFromFile } from "./src/day9_text_to_speech.js";
    import { photoNarration, meetingToPodcast } from "./src/day10_multimodal.js";

// Parse CLI flags of the form `--key value`; a flag followed by another
// flag (or nothing) becomes a boolean `true`.
const args = Object.fromEntries(
  process.argv.slice(2).flatMap((token, idx, all) => {
    if (!token.startsWith("--")) return [];
    const next = all[idx + 1];
    const value = next && !next.startsWith("--") ? next : true;
    return [[token.replace(/^--/, ""), value]];
  })
);

/**
 * CLI entry point: dispatches on --task to the matching demo pipeline.
 * Supported tasks: mm | tts | stt | vision | image | chat | teacher |
 * review | sentiment | json_summary.
 */
async function main() {
  const task = args.task || "chat";

  if (task === "mm") {
    const mode = args.mode || "photo_narration";
    if (mode === "photo_narration") {
      const out = await photoNarration({
        imagePath: args.imagePath || null,
        imageUrl: args.imageUrl || null,
        wantOCR: args.ocr === "true" || args.ocr === true,
        length: args.length || "medium",
        voice: args.voice || "alloy",
        format: args.format || "mp3",
      });
      console.log("\n=== Photo Narration ===");
      console.log("描述 JSON:", out.jsonPath);
      console.log("語音檔:", out.audioPath);
    } else if (mode === "transcript_podcast") {
      const out = await meetingToPodcast({
        filePath: args.filePath || null,
        url: args.url || null,
        lang: args.lang || "zh",
        prompt: args.prompt || "",
        length: args.length || "medium",
        voice: args.voice || "aria",
        format: args.format || "mp3",
      });
      console.log("\n=== Meeting → Podcast ===");
      console.log("逐字稿 TXT:", out.transcriptPath);
      console.log("摘要 JSON:", out.summaryPath);
      console.log("Podcast 語音:", out.audioPath);
    } else {
      console.log("未知模式,請使用 --mode photo_narration | transcript_podcast");
    }

  } else if (task === "tts") {
    const text = args.text || "";
    const file = args.file || "";
    const model = args.model || process.env.OPENAI_TTS_MODEL || "gpt-4o-mini-tts";
    const voice = args.voice || "alloy";
    const format = args.format || "mp3";
    const speed = args.speed ? Number(args.speed) : 1.0;
    const filename = args.out || undefined;
    if (file) {
      const { filepath, bytes } = await speakFromFile(file, { model, voice, format, speed, filename });
      console.log("\n=== 文字檔 → 語音 ===");
      // FIX: the byte-count note was missing template-literal backticks.
      console.log("輸出:", filepath, `(${bytes} bytes)`);
    } else {
      const content = text || "這是一段測試用的語音。";
      const { filepath, bytes } = await speak({ text: content, model, voice, format, speed, filename });
      console.log("\n=== 文字 → 語音 ===");
      console.log("輸出:", filepath, `(${bytes} bytes)`);
    }

  } else if (task === "stt") {
    const filePath = args.filePath || null;
    const url = args.url || null;
    const language = args.lang || "";
    const prompt = args.prompt || "";
    const detailed = args.detailed === "true" || args.detailed === true;
    const { text, saved } = await transcribe({ filePath, url, language, prompt, detailed });
    console.log("\n=== 語音轉文字(STT) ===\n");
    console.log(text);
    console.log("\n已儲存:", saved);

  } else if (task === "vision") {
    const imagePath = args.imagePath || null;
    const imageUrl = args.imageUrl || null;
    const wantOCR = args.ocr === "true" || args.ocr === true;
    const length = args.length || "medium";
    const out = await imageToJson({ imagePath, imageUrl, wantOCR, length });
    console.log("\n=== 圖片 → JSON 描述 ===\n");
    console.log(JSON.stringify(out, null, 2));

  } else if (task === "image") {
    const prompt = args.text || "一隻戴著太空頭盔的柴犬,漂浮在月球上,插著台灣國旗";
    const size = args.size || "512x512";
    const n = args.n ? Number(args.n) : 1;
    const urls = await textToImage(prompt, { size, n });
    console.log("\n=== 生成圖片 ===\n");
    urls.forEach((f) => console.log("已儲存:" + f));

  } else if (task === "chat") {
    const sessionId = args.session || "default";
    if (args.reset) {
      resetSession(sessionId);
      // FIX: interpolated messages below were missing template-literal backticks.
      console.log(`已重設會話:${sessionId}`);
      return;
    }
    const input = args.text || "嗨,我想規劃 3 天 2 夜的台中旅遊行程。";
    const { reply } = await chatOnce(input, { sessionId });
    console.log(`\n[${sessionId}] AI:\n${reply}\n`);

  } else if (task === "teacher") {
    const out = await englishTeacher(args.text || "He go to school every day.");
    console.log("\n=== 英文老師 ===\n");
    console.log(out);

  } else if (task === "review") {
    const out = await codeReview("function sum(arr){ return arr.reduce((a,b)=>a+b,0) }");
    console.log("\n=== 程式碼審查 ===\n");
    console.log(out);

  } else if (task === "sentiment") {
    const out = await sentimentClassify(args.text || "今天心情糟透了,事情一團亂。");
    console.log("\n=== 情緒分類(JSON) ===\n");
    console.log(out);

  } else if (task === "json_summary") {
    const out = await newsToJson(args.text || "OpenAI 發布新模型,效能大幅提升。");
    console.log("\n=== 新聞 JSON 摘要 ===\n");
    console.log(out);

  } else {
    console.log("未知任務,請使用 --task mm | tts | stt | vision | image | chat | teacher | review | sentiment | json_summary");
  }
}

// Top-level launch: report any pipeline error and exit non-zero.
main().catch((e) => {
  console.error("發生錯誤:", e.message);
  process.exit(1);
});

  1. package.json(新增 Scripts)

保留你原有的 scripts,只新增以下:

{
"scripts": {
"day10:photo": "node index.js --task mm --mode photo_narration --imageUrl https://images.unsplash.com/photo-1519681393784-d120267933ba --voice alloy --format mp3",
"day10:podcast": "node index.js --task mm --mode transcript_podcast --url https://example.com/demo.m4a --lang zh --prompt \"專案:SmartGo Plus;術語:RAG、LIFF、IIS\" --voice aria --format mp3"
}
}

▶️ 如何執行(CLI)

1) 圖片→描述→配音

npm run day10:photo --silent

生成於:

outputs/mm/photo_narration/desc_<時間戳>.json

outputs/mm/photo_narration/narration_<時間戳>.mp3

2) 會議→逐字稿→摘要→Podcast 配音

npm run day10:podcast --silent

生成於:

outputs/transcripts/<檔名>.txt (Day8 產物)

outputs/mm/meeting_podcast/<檔名>_summary.json

outputs/mm/meeting_podcast/<檔名>_podcast.mp3


上一篇
文字轉語音(TTS, Text-to-Speech)
系列文
練習AI11
圖片
  熱門推薦
圖片
{{ item.channelVendor }} | {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言